{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Nulls in the data set satisfaction_level       0\n",
      "last_evaluation          0\n",
      "number_project           0\n",
      "average_montly_hours     0\n",
      "time_spend_company       0\n",
      "Work_accident            0\n",
      "left                     0\n",
      "promotion_last_5years    0\n",
      "sales                    0\n",
      "salary                   0\n",
      "dtype: int64\n"
     ]
    }
   ],
   "source": [
    "import numpy as np\n",
    "import pandas as pd\n",
    "# Load the HR data set; header=0 takes the column names from the first row.\n",
    "hr_data = pd.read_csv('data/hr.csv', header=0)\n",
    "\n",
    "# Per-column count of missing values, printed after the label.\n",
    "null_counts = hr_data.isna().sum()\n",
    "print('Nulls in the data set', null_counts)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "New nulls in the data set satisfaction_level          0\n",
      "last_evaluation             0\n",
      "number_project              0\n",
      "average_montly_hours        0\n",
      "time_spend_company          0\n",
      "Work_accident               0\n",
      "left                        0\n",
      "promotion_last_5years       0\n",
      "sales                    4140\n",
      "salary                   7316\n",
      "dtype: int64\n"
     ]
    }
   ],
   "source": [
    "# The loaded data has no nulls, so create some for the later cells to fill:\n",
    "# turn the 'sales' entries of the sales column and the 'low' entries of the\n",
    "# salary column into NaN.\n",
    "# np.nan is used because the np.NaN alias was removed in NumPy 2.0.\n",
    "hr_data['sales'] = hr_data['sales'].replace('sales', np.nan)\n",
    "hr_data['salary'] = hr_data['salary'].replace('low', np.nan)\n",
    "\n",
    "print('New nulls in the data set' ,hr_data.isnull().sum())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "satisfaction_level       0\n",
      "last_evaluation          0\n",
      "number_project           0\n",
      "average_montly_hours     0\n",
      "time_spend_company       0\n",
      "Work_accident            0\n",
      "left                     0\n",
      "promotion_last_5years    0\n",
      "sales                    0\n",
      "salary                   0\n",
      "dtype: int64\n",
      "   satisfaction_level  last_evaluation  number_project  average_montly_hours  \\\n",
      "0                0.38             0.53               2                   157   \n",
      "1                0.80             0.86               5                   262   \n",
      "2                0.11             0.88               7                   272   \n",
      "3                0.72             0.87               5                   223   \n",
      "4                0.37             0.52               2                   159   \n",
      "\n",
      "   time_spend_company  Work_accident  left  promotion_last_5years      sales  \\\n",
      "0                   3              0     1                      0  technical   \n",
      "1                   6              0     1                      0  technical   \n",
      "2                   4              0     1                      0  technical   \n",
      "3                   5              0     1                      0  technical   \n",
      "4                   3              0     1                      0  technical   \n",
      "\n",
      "   salary  \n",
      "0  medium  \n",
      "1  medium  \n",
      "2  medium  \n",
      "3  medium  \n",
      "4  medium  \n"
     ]
    }
   ],
   "source": [
    "# Mode imputation: fill each column's missing values with that column's most\n",
    "# frequent value, leaving hr_data itself untouched.\n",
    "# Calling fillna(..., inplace=True) on a hr_data_1[column] selection is chained\n",
    "# assignment, which pandas deprecates and which stops updating the frame under\n",
    "# Copy-on-Write, so build the filled frame with one non-mutating call instead.\n",
    "column_modes = {column: hr_data[column].mode()[0] for column in hr_data.columns}\n",
    "hr_data_1 = hr_data.fillna(value=column_modes)\n",
    "# count the number of NaN values in each column\n",
    "print(hr_data_1.isnull().sum())\n",
    "print(hr_data_1.head())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "satisfaction_level       0\n",
      "last_evaluation          0\n",
      "number_project           0\n",
      "average_montly_hours     0\n",
      "time_spend_company       0\n",
      "Work_accident            0\n",
      "left                     0\n",
      "promotion_last_5years    0\n",
      "sales                    0\n",
      "salary                   0\n",
      "dtype: int64\n",
      "   satisfaction_level  last_evaluation  number_project  average_montly_hours  \\\n",
      "0                0.38             0.53               2                   157   \n",
      "1                0.80             0.86               5                   262   \n",
      "2                0.11             0.88               7                   272   \n",
      "3                0.72             0.87               5                   223   \n",
      "4                0.37             0.52               2                   159   \n",
      "\n",
      "   time_spend_company  Work_accident  left  promotion_last_5years sales  \\\n",
      "0                   3              0     1                      0   AAA   \n",
      "1                   6              0     1                      0   AAA   \n",
      "2                   4              0     1                      0   AAA   \n",
      "3                   5              0     1                      0   AAA   \n",
      "4                   3              0     1                      0   AAA   \n",
      "\n",
      "   salary  \n",
      "0     AAA  \n",
      "1  medium  \n",
      "2  medium  \n",
      "3     AAA  \n",
      "4     AAA  \n"
     ]
    }
   ],
   "source": [
    "# Constant imputation: every missing entry gets the same placeholder value.\n",
    "# fillna returns a new frame here, so hr_data keeps its NaNs.\n",
    "hr_data_2 = hr_data.fillna('AAA')\n",
    "\n",
    "# Confirm no NaN values remain, then preview the first rows.\n",
    "remaining_nulls = hr_data_2.isna().sum()\n",
    "print(remaining_nulls)\n",
    "print(hr_data_2.head())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
